import os
import gzip
from numpy import *
from Bio import SeqIO


# Names of the dataset directories to analyze.
datasets = ("MiSeq", "HiSeq", "CAGE")

# Read lengths covered by a freshly allocated histogram. Histograms grow on
# demand, so longer reads are still counted.
lengths = arange(1000)


def add_length(counts, length):
    """Record one read of the given length in a length histogram.

    counts is a 1-D array in which element i holds the number of reads of
    length i. If length lies beyond the end of the array, a longer,
    zero-padded copy is created first; the caller must therefore continue
    with the returned array rather than the one passed in.
    """
    if length >= counts.size:
        # At least double the size so a run of long reads does not trigger
        # a copy for every single read.
        new_size = 2 * counts.size
        if new_size <= length:
            new_size = length + 1
        grown = zeros(new_size)
        grown[:counts.size] = counts
        counts = grown
    counts[length] += 1
    return counts


def tally_file(path, fmt, counts, compressed=None):
    """Add the length of every record in one sequence file to counts.

    path is the file to read, fmt the format name handed to SeqIO.parse.
    compressed selects gzip decompression; if None, it is derived from a
    ".gz" file name suffix. Returns the (possibly reallocated) histogram.
    """
    if compressed is None:
        compressed = path.endswith(".gz")
    # The with-statement closes the file even if parsing fails halfway.
    with (gzip.open(path, "rt") if compressed else open(path)) as handle:
        for record in SeqIO.parse(handle, fmt):
            counts = add_length(counts, len(record.seq))
    return counts


def summarize_lengths(counts):
    """Compute summary statistics of a read-length histogram.

    counts[i] is the number of reads of length i. Returns the tuple
    (median, mean, stddev, minimum, maximum). The median is the smallest
    length at which the cumulative count reaches half of the total.

    Raises ValueError if the histogram contains no reads.
    """
    counts = asarray(counts, dtype=float)
    total = counts.sum()
    if not total > 0:
        raise ValueError("no reads counted; cannot compute length statistics")
    values = arange(counts.size)
    weights = counts / total
    mean = dot(weights, values)
    variance = dot(weights, values**2) - mean * mean
    if variance < 0:
        # Rounding can push a zero variance slightly below zero, which
        # would turn the square root into NaN.
        variance = 0.0
    stddev = sqrt(variance)
    # Use the raw counts for the median so that the halfway point is not
    # affected by rounding in the normalized weights.
    cumulative = cumsum(counts)
    median = values[cumulative.searchsorted(total / 2)]
    # nonzero() returns indices in increasing order, so the first and last
    # entries are the shortest and longest observed read lengths.
    observed = nonzero(counts)[0]
    return median, mean, stddev, observed[0], observed[-1]


def main():
    """Print read-length statistics for every dataset in datasets."""
    for dataset in datasets:
        directory = os.path.join("/osc-fs_home/mdehoon/Data/CASPARs/", dataset)
        for subdirectory in os.listdir(directory):
            if subdirectory in ("Fasta", "Fastq"):
                break
        else:
            raise Exception("Failed to find sequence data for %s" % dataset)
        print("Analyzing", dataset)
        # The directory name doubles as the SeqIO format name.
        fmt = subdirectory.lower()
        subdirectory = os.path.join(directory, subdirectory)
        filenames = [
            filename
            for filename in os.listdir(subdirectory)
            if not os.path.isdir(os.path.join(subdirectory, filename))
        ]
        if dataset == "MiSeq":
            # Paired-end data: report each read of the pair separately.
            for readno in ("READ1", "READ2"):
                counts = zeros(lengths.size)
                for filename in filenames:
                    terms = filename.split(".")
                    if terms[1:3] != ["fq", "gz"]:
                        raise ValueError(
                            "Unexpected MiSeq file name %r (expected <name>.fq.gz)"
                            % filename
                        )
                    if not terms[0].endswith(readno):
                        continue
                    path = os.path.join(subdirectory, filename)
                    print("Reading", path)
                    counts = tally_file(path, fmt, counts, compressed=True)
                median, mean, stddev, minimum, maximum = summarize_lengths(counts)
                print("%s, %s: median = %d, mean = %d, standard deviation = %.1f, minimum = %d, maximum = %d" % (dataset, readno, median, mean, stddev, minimum, maximum))
        else:
            counts = zeros(lengths.size)
            for filename in filenames:
                path = os.path.join(subdirectory, filename)
                print("Reading", path)
                counts = tally_file(path, fmt, counts)
            median, mean, stddev, minimum, maximum = summarize_lengths(counts)
            print("%s: median = %d, mean = %d, standard deviation = %.1f, minimum = %d, maximum = %d" % (dataset, median, mean, stddev, minimum, maximum))


if __name__ == "__main__":
    main()
